.Rmd}install.packages("put_package_name_here") into the consoleinstall.packages("tidyverse")library("put_package_name_here") into your code chunklibrary("tidyverse")# Load Relevant Packages
library(tidyverse) # piping `%>%`, plotting, reading data
library(skimr) # exploratory data summary
library(naniar) # exploratory plots
library(kableExtra) # tables
library(lubridate) # for date variables
library(plotly)
# Extension
# Try to save time by installing a package to install packages!
# install.packages("pacman")
# pacman::p_load(tidyverse, skimr, naniar, kableExtra, lubridate, plotly)
If the data is online:
This is easy for datasets that are not too large & already hosted online!
If the data is a local file:
If you are knitting your document: it will automatically look for data in the same folder as your R file. So you should have no dramas (per step 1.).
If you are running a section of code: you will need to specify which folder the data is in. The best way to do this is by following these menu options: Session Menu >> Set Working Directory >> To Source File Location.
read_csv() function which comes from the tidyverse package.# Load Data
data = read_csv("https://www.maths.usyd.edu.au/u/UG/OL/OLEO5605/r/NYC_Drinking_Water.csv")
#data = read_csv("Drinking_Water_Quality_Distribution_Monitoring_Data.csv")
# Glimpse Function [From tidyverse package]
data %>% glimpse()
## Rows: 72,709
## Columns: 10
## $ `Sample Date` <chr> "01/01/2015", "01/01/2015", "01…
## $ `Sample Time` <time> 12:19:00, 11:15:00, 10:09:00, …
## $ `Sample Site` <chr> "1S07", "1S04", "1S03A", "1S03B…
## $ `Sample class` <chr> "Operational", "Operational", "…
## $ Location <chr> "SS - Shaft 7 of City Tunnel No…
## $ `Residual Free Chlorine (mg/L)` <dbl> 0.58, 0.71, 0.79, 0.77, 0.74, 0…
## $ `Turbidity (NTU)` <dbl> 0.96, 0.94, 0.93, 0.93, 0.95, 1…
## $ `Fluoride (mg/L)` <dbl> 0.79, 0.80, 0.79, 0.80, NA, NA,…
## $ `Coliform (Quanti-Tray) (MPN /100mL)` <chr> "<1", "<1", "<1", "<1", "<1", "…
## $ `E.coli(Quanti-Tray) (MPN/100mL)` <chr> "<1", "<1", "<1", "<1", "<1", "…
# Skim Function [From skimr package]
data %>% skim()
| Name | Piped data |
| Number of rows | 72709 |
| Number of columns | 10 |
| _______________________ | |
| Column type frequency: | |
| character | 6 |
| difftime | 1 |
| numeric | 3 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Sample Date | 0 | 1 | 10 | 10 | 0 | 1673 | 0 |
| Sample Site | 0 | 1 | 4 | 5 | 0 | 398 | 0 |
| Sample class | 0 | 1 | 9 | 20 | 0 | 7 | 0 |
| Location | 0 | 1 | 29 | 152 | 0 | 1263 | 0 |
| Coliform (Quanti-Tray) (MPN /100mL) | 0 | 1 | 1 | 6 | 0 | 47 | 0 |
| E.coli(Quanti-Tray) (MPN/100mL) | 0 | 1 | 1 | 2 | 0 | 3 | 0 |
Variable type: difftime
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| Sample Time | 2 | 1 | 17520 secs | 57780 secs | 10:10:00 | 463 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Residual Free Chlorine (mg/L) | 3 | 1.00 | 0.58 | 0.21 | -9.99 | 0.45 | 0.59 | 0.72 | 2.20 | ▁▁▁▁▇ |
| Turbidity (NTU) | 506 | 0.99 | 0.74 | 0.28 | 0.07 | 0.62 | 0.73 | 0.86 | 33.80 | ▇▁▁▁▁ |
| Fluoride (mg/L) | 63336 | 0.13 | 0.71 | 0.06 | 0.03 | 0.69 | 0.71 | 0.73 | 0.89 | ▁▁▁▇▇ |
# Summary Function [From base package -- preinstalled!]
data %>% summary()
## Sample Date Sample Time Sample Site Sample class
## Length:72709 Length:72709 Length:72709 Length:72709
## Class :character Class1:hms Class :character Class :character
## Mode :character Class2:difftime Mode :character Mode :character
## Mode :numeric
##
##
##
## Location Residual Free Chlorine (mg/L) Turbidity (NTU)
## Length:72709 Min. :-9.9900 Min. : 0.0700
## Class :character 1st Qu.: 0.4500 1st Qu.: 0.6200
## Mode :character Median : 0.5900 Median : 0.7300
## Mean : 0.5845 Mean : 0.7385
## 3rd Qu.: 0.7200 3rd Qu.: 0.8600
## Max. : 2.2000 Max. :33.8000
## NA's :3 NA's :506
## Fluoride (mg/L) Coliform (Quanti-Tray) (MPN /100mL)
## Min. :0.03 Length:72709
## 1st Qu.:0.69 Class :character
## Median :0.71 Mode :character
## Mean :0.71
## 3rd Qu.:0.73
## Max. :0.89
## NA's :63336
## E.coli(Quanti-Tray) (MPN/100mL)
## Length:72709
## Class :character
## Mode :character
##
##
##
##
# vis_miss function [From visdat or naniar packages]
vis_miss(data, warn_large_data = FALSE)
data %>%
select(where(is.numeric)) %>%
pivot_longer(everything(),
names_to = "Variable",
values_to = "Value") %>%
group_by(Variable) %>%
summarise(Mean = mean(Value, na.rm = TRUE),
Median = median(Value, na.rm = TRUE)) %>%
kable() %>% # putting into a table
kable_styling(bootstrap_options = c("hover")) # making table look good
## `summarise()` ungrouping output (override with `.groups` argument)
| Variable | Mean | Median |
|---|---|---|
| Fluoride (mg/L) | 0.7144069 | 0.71 |
| Residual Free Chlorine (mg/L) | 0.5844564 | 0.59 |
| Turbidity (NTU) | 0.7385309 | 0.73 |
p = data %>%
select(where(is.numeric)) %>%
pivot_longer(everything(),
names_to = "Variable",
values_to = "Value") %>%
filter(!is.na(Value)) %>%
ggplot(aes(x = Value)) +
facet_wrap(~ Variable, scales = "free_y") +
geom_histogram(bins = 30, fill = "lightblue", color = "darkblue") +
labs(y = "Frequency") +
theme_bw()
ggplotly(p)
p = data %>%
select(where(is.numeric)) %>%
pivot_longer(everything(),
names_to = "Variable",
values_to = "Value") %>%
filter(!is.na(Value)) %>%
ggplot(aes(y = Value)) +
facet_wrap(~ Variable, scales = "free_y") +
geom_boxplot(fill = "lightblue", color = "darkblue") +
theme_bw() +
theme(strip.text.x = element_text(size = 8))
p
# See what state things are currently in
data %>% glimpse()
## Rows: 72,709
## Columns: 10
## $ `Sample Date` <chr> "01/01/2015", "01/01/2015", "01…
## $ `Sample Time` <time> 12:19:00, 11:15:00, 10:09:00, …
## $ `Sample Site` <chr> "1S07", "1S04", "1S03A", "1S03B…
## $ `Sample class` <chr> "Operational", "Operational", "…
## $ Location <chr> "SS - Shaft 7 of City Tunnel No…
## $ `Residual Free Chlorine (mg/L)` <dbl> 0.58, 0.71, 0.79, 0.77, 0.74, 0…
## $ `Turbidity (NTU)` <dbl> 0.96, 0.94, 0.93, 0.93, 0.95, 1…
## $ `Fluoride (mg/L)` <dbl> 0.79, 0.80, 0.79, 0.80, NA, NA,…
## $ `Coliform (Quanti-Tray) (MPN /100mL)` <chr> "<1", "<1", "<1", "<1", "<1", "…
## $ `E.coli(Quanti-Tray) (MPN/100mL)` <chr> "<1", "<1", "<1", "<1", "<1", "…
# Data Cleaning
## Changing type of `Sample Date`
# Note: mdy from [From tidyverse package]
data = data %>%
mutate(`Sample Date` = mdy(`Sample Date`))
# Adding additional time columns
data = data %>%
mutate(`Week of Year` = week(`Sample Date`),
`Weekday` = wday(`Sample Date`),
`Month Number` = month(`Sample Date`),
`Hour` = hour(`Sample Time`))
# Giving Month Name an Order
data = data %>%
mutate(`Month` = factor(month.name[`Month Number`],
levels = month.name))
# Converting Categorical Variables to Factors
data = data %>%
mutate(across(where(is_character) & !c(Location, `Sample Site`),
as_factor))
# Drop NA -- Q: Do you think this is appropriate?
# data = data %>%
# drop_na()
# Check we're happy with cleaned data
data %>% glimpse()
## Rows: 72,709
## Columns: 15
## $ `Sample Date` <date> 2015-01-01, 2015-01-01, 2015-0…
## $ `Sample Time` <time> 12:19:00, 11:15:00, 10:09:00, …
## $ `Sample Site` <chr> "1S07", "1S04", "1S03A", "1S03B…
## $ `Sample class` <fct> Operational, Operational, Opera…
## $ Location <chr> "SS - Shaft 7 of City Tunnel No…
## $ `Residual Free Chlorine (mg/L)` <dbl> 0.58, 0.71, 0.79, 0.77, 0.74, 0…
## $ `Turbidity (NTU)` <dbl> 0.96, 0.94, 0.93, 0.93, 0.95, 1…
## $ `Fluoride (mg/L)` <dbl> 0.79, 0.80, 0.79, 0.80, NA, NA,…
## $ `Coliform (Quanti-Tray) (MPN /100mL)` <fct> <1, <1, <1, <1, <1, <1, <1, <1,…
## $ `E.coli(Quanti-Tray) (MPN/100mL)` <fct> <1, <1, <1, <1, <1, <1, <1, <1,…
## $ `Week of Year` <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ Weekday <dbl> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5…
## $ `Month Number` <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ Hour <int> 12, 11, 10, 10, 9, 8, 9, 11, 7,…
## $ Month <fct> January, January, January, Janu…
data %>% summary()
## Sample Date Sample Time Sample Site
## Min. :2015-01-01 Length:72709 Length:72709
## 1st Qu.:2016-04-03 Class1:hms Class :character
## Median :2017-06-26 Class2:difftime Mode :character
## Mean :2017-06-14 Mode :numeric
## 3rd Qu.:2018-09-11
## Max. :2019-10-31
##
## Sample class Location Residual Free Chlorine (mg/L)
## Operational :27733 Length:72709 Min. :-9.9900
## Compliance :44289 Class :character 1st Qu.: 0.4500
## Resample_Compliance : 472 Mode :character Median : 0.5900
## Resample_Operational: 1 Mean : 0.5845
## Entry Point : 168 3rd Qu.: 0.7200
## Re-Sample : 33 Max. : 2.2000
## Op-resample : 13 NA's :3
## Turbidity (NTU) Fluoride (mg/L) Coliform (Quanti-Tray) (MPN /100mL)
## Min. : 0.0700 Min. :0.03 <1 :72436
## 1st Qu.: 0.6200 1st Qu.:0.69 1 : 102
## Median : 0.7300 Median :0.71 >200.5 : 33
## Mean : 0.7385 Mean :0.71 2 : 28
## 3rd Qu.: 0.8600 3rd Qu.:0.73 3.1 : 20
## Max. :33.8000 Max. :0.89 4.2 : 9
## NA's :506 NA's :63336 (Other): 81
## E.coli(Quanti-Tray) (MPN/100mL) Week of Year Weekday
## <1:72704 Min. : 1.00 Min. :1.000
## - : 1 1st Qu.:14.00 1st Qu.:2.000
## 1 : 4 Median :26.00 Median :4.000
## Mean :26.15 Mean :4.019
## 3rd Qu.:38.00 3rd Qu.:6.000
## Max. :53.00 Max. :7.000
##
## Month Number Hour Month
## Min. : 1.000 Min. : 4.00 August : 6972
## 1st Qu.: 4.000 1st Qu.: 9.00 July : 6851
## Median : 6.000 Median :10.00 May : 6796
## Mean : 6.422 Mean : 9.68 June : 6630
## 3rd Qu.: 9.000 3rd Qu.:11.00 September: 6578
## Max. :12.000 Max. :16.00 January : 6521
## NA's :2 (Other) :32361
top_10_turbidity = data %>%
arrange(desc(`Turbidity (NTU)`)) %>%
select(`Sample Date`, `Turbidity (NTU)`) %>%
head(10)
top_10_turbidity %>%
kable(caption = "Top 10 Turbidity readings") %>%
kable_styling(bootstrap_options = c("hover"))
| Sample Date | Turbidity (NTU) |
|---|---|
| 2018-01-16 | 33.80 |
| 2017-05-19 | 27.10 |
| 2019-05-17 | 14.10 |
| 2017-12-15 | 6.97 |
| 2018-01-23 | 6.97 |
| 2017-11-27 | 6.68 |
| 2017-12-13 | 6.29 |
| 2016-05-16 | 6.04 |
| 2017-12-29 | 5.96 |
| 2018-02-21 | 5.89 |
The highest Turbidity rating was on 2018-01-16 with a reading of 33.8.
class_medians = data %>%
group_by(`Sample class`) %>%
summarise(med_chlorine = median(`Residual Free Chlorine (mg/L)`, na.rm = TRUE),
med_turbidity = median(`Turbidity (NTU)`, na.rm = TRUE),
med_flouride = median(`Fluoride (mg/L)`, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
class_medians %>%
kable() %>%
kable_styling(bootstrap_options = c("hover"))
| Sample class | med_chlorine | med_turbidity | med_flouride |
|---|---|---|---|
| Operational | 0.69 | 0.75 | 0.71 |
| Compliance | 0.52 | 0.72 | 0.71 |
| Resample_Compliance | 0.47 | 0.70 | NA |
| Resample_Operational | 0.75 | 0.98 | NA |
| Entry Point | 0.63 | 0.72 | 0.72 |
| Re-Sample | 0.39 | 0.67 | NA |
| Op-resample | 0.73 | 0.70 | 0.72 |
p = data %>%
filter(`Sample class` == "Entry Point" |
`Sample class` == "Operational") %>%
ggplot(aes(x = `Sample class`,
y = `Residual Free Chlorine (mg/L)`)) +
geom_boxplot(fill = "lightblue", color = "darkblue") +
labs(title = "Residual Free Chlorine (mg/L) for different sample classes") +
theme_bw()
ggplotly(p)
site_medians_wide = data %>%
group_by(`Sample Site`) %>%
summarise(med_chlorine = median(`Residual Free Chlorine (mg/L)`, na.rm = TRUE),
med_turbidity = median(`Turbidity (NTU)`, na.rm = TRUE),
med_fluoride = median(`Fluoride (mg/L)`, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
# Tidy Way
site_medians_long = site_medians_wide %>%
pivot_longer(!c(`Sample Site`),
names_to = "Median Type",
values_to = "Median Value")
max_min_median_sites = site_medians_long %>%
group_by(`Median Type`) %>%
summarise(
Min_Val = min(`Median Value`, na.rm = TRUE),
Min_Site = paste(`Sample Site`[which(`Median Value` == Min_Val)],
collapse = ", "),
Max_Val = max(`Median Value`, na.rm = TRUE),
Max_Site = paste(`Sample Site`[which(`Median Value` == Max_Val)],
collapse = ", ")
)
## `summarise()` ungrouping output (override with `.groups` argument)
max_min_median_sites %>%
kable() %>%
kable_styling(bootstrap_options = c("hover"))
| Median Type | Min_Val | Min_Site | Max_Val | Max_Site |
|---|---|---|---|---|
| med_chlorine | 0.11 | 77750 | 0.91 | 1S03A |
| med_fluoride | 0.69 | 32350 | 0.81 | 1S04 |
| med_turbidity | 0.45 | 3SC26 | 1.03 | 51900 |
# Non-Tidy Way -- copy code for each Median Type
# site_medians_wide %>%
# select(`Sample Site`, med_turbidity) %>%
# arrange(desc(med_turbidity)) %>%
# filter(row_number() %in% c(1, n()))
#
# site_medians_wide %>%
# select(`Sample Site`, med_fluoride) %>%
# arrange(desc(med_fluoride)) %>%
# filter(row_number() %in% c(1, n()))
#
# site_medians_wide %>%
# select(`Sample Site`, med_chlorine) %>%
# arrange(desc(med_chlorine)) %>%
# filter(row_number() %in% c(1, n()))
site_names = max_min_median_sites %>%
filter(`Median Type` == "med_turbidity") %>%
select(`Min_Site`, `Max_Site`) %>%
t()
p = data %>%
filter(`Sample Site` %in% site_names) %>%
ggplot(aes(x = `Turbidity (NTU)`, fill = `Sample Site`)) +
geom_histogram(bins = 30, color = "white") +
scale_fill_manual(values = c("lightblue", "darkblue")) +
theme_bw() +
labs(y = "Frequency")
ggplotly(p)
p = data %>%
filter(`Sample Site` %in% site_names) %>%
ggplot(aes(x = `Sample Date`, y = `Turbidity (NTU)`, color = `Sample Site`))+
geom_line() +
scale_color_manual(values = c("lightblue", "darkblue")) +
theme_bw()
ggplotly(p)
p = data %>%
group_by(`Sample Date`) %>%
summarise(med_chlorine = median(`Residual Free Chlorine (mg/L)`, na.rm = TRUE),
med_turbidity = median(`Turbidity (NTU)`, na.rm = TRUE),
med_fluoride = median(`Fluoride (mg/L)`, na.rm = TRUE)) %>%
pivot_longer(!c(`Sample Date`),
names_to = "Median Type",
values_to = "Median Value") %>%
ggplot(aes(x = `Sample Date`, y = `Median Value`, colour = `Median Type`)) +
geom_line() +
scale_color_manual(values = c("#ab5f54", "lightblue", "darkblue")) +
theme_bw()
## `summarise()` ungrouping output (override with `.groups` argument)
ggplotly(p)
p = data %>%
group_by(Month) %>%
ggplot(aes(x = Month, y = log(`Turbidity (NTU)`))) +
geom_boxplot(fill = "lightblue", color = "darkblue") +
theme_bw() +
theme(text = element_text(size = 10),
axis.text.x = element_text(angle = 45, vjust = -0.5)) +
labs(y = "Log of Turbidity (NTU)")
ggplotly(p)
What questions do you have about the data? Insert your analysis here.
# Code...